{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loopy: Transforming a PDE to Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import pymbolic.primitives as p\n",
    "\n",
    "u = p.Variable(\"u\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll write code that evaluates $\\triangle u$ using finite differences.\n",
    "\n",
    "To that end, we define a new expression 'thing': An operator for the Laplacian."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sum((Laplacian(Variable('u')), Power(Variable('u'), 2), -1))"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class Laplacian(p.Expression):\n",
    "    def __init__(self, child):\n",
    "        self.child = child\n",
    "        \n",
    "    def __getinitargs__(self):\n",
    "        return (self.child,)\n",
    "    \n",
    "    mapper_method = \"map_laplacian\"\n",
    "        \n",
    "pde = Laplacian(u)+u**2-1\n",
    "pde"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we'll write code to turn Laplacians into their discrete finite difference forms, using `i` and `j` as formal indices, using\n",
    "\n",
    "$$f''(x) \\approx \\frac{f(x+h) - 2 f(x) + f(x-h)}{h^{2}}$$\n",
    "\n",
    "Pay close attention to indexing!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "from pymbolic.mapper import IdentityMapper\n",
    "\n",
    "i = p.Variable(\"i\")\n",
    "j = p.Variable(\"j\")\n",
    "\n",
    "ii = i+1\n",
    "jj = j+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "#clear\n",
    "class FDMapper(IdentityMapper):\n",
    "    def map_variable(self, expr):\n",
    "        return expr[ii, jj]\n",
    "\n",
    "    def map_laplacian(self, expr):\n",
    "        var = expr.child\n",
    "        return (-4*var[ii,jj] + var[ii+1,jj] + var[ii-1,jj]\n",
    "                + var[ii,jj+1] + var[ii,jj-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]\n"
     ]
    }
   ],
   "source": [
    "fd_mapper = FDMapper()\n",
    "print(fd_mapper(pde))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "Now let's generate some code for this, using `loopy`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import loopy as lp\n",
    "from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2\n",
    "\n",
    "result = p.Variable(\"result\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Observe the two parts of the `loopy` kernel description:\n",
    "\n",
    "* Polyhedral loop domain\n",
    "* Instructions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "#clear\n",
    "knl = lp.make_kernel(\n",
    "    \"{[i,j]: 0<=i,j<n}\",\n",
    "    [lp.Assignment(result[ii, jj], fd_mapper(pde))],\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Kernels can always be inspected--simply use `print`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "---------------------------------------------------------------------------\n",
      "KERNEL: loopy_kernel\n",
      "---------------------------------------------------------------------------\n",
      "ARGUMENTS:\n",
      "n: ValueArg, type: <auto/runtime>\n",
      "result: type: <auto/runtime>, shape: (1 + n, 1 + n), dim_tags: (N1:stride:1 + n, N0:stride:1) aspace: global\n",
      "u: type: <auto/runtime>, shape: (2 + n, 2 + n), dim_tags: (N1:stride:2 + n, N0:stride:1) aspace: global\n",
      "---------------------------------------------------------------------------\n",
      "DOMAINS:\n",
      "[n] -> { [i, j] : 0 <= i < n and 0 <= j < n }\n",
      "---------------------------------------------------------------------------\n",
      "INAME IMPLEMENTATION TAGS:\n",
      "i: None\n",
      "j: None\n",
      "---------------------------------------------------------------------------\n",
      "INSTRUCTIONS:\n",
      "for j, i\n",
      "    \u001b[36mresult[i + 1, j + 1]\u001b[0m = \u001b[35mu[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]\u001b[0m  {id=\u001b[32minsn\u001b[0m}\n",
      "end j, i\n",
      "---------------------------------------------------------------------------\n"
     ]
    }
   ],
   "source": [
    "print(knl)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "Let's move towards running this code. To do so, we need `pyopencl`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pyopencl as cl\n",
    "import pyopencl.array\n",
    "import pyopencl.clrandom\n",
    "\n",
    "ctx = cl.create_some_context()\n",
    "queue = cl.CommandQueue(ctx)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And some data to work with:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "n = 1000\n",
    "u = cl.clrandom.rand(queue, (n+2,n+2), dtype=np.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now run the code, and tell loopy to print what it generates:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ result, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ u)\n",
      "{\n",
      "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= -\u001b[34m1\u001b[39;49;00m + n; ++j)\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= -\u001b[34m1\u001b[39;49;00m + n; ++i)\n",
      "      result[(\u001b[34m1\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + i) + \u001b[34m1\u001b[39;49;00m + j] = u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + i) + \u001b[34m1\u001b[39;49;00m + j] * u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + i) + \u001b[34m1\u001b[39;49;00m + j] + -\u001b[34m1.0f\u001b[39;49;00m + -\u001b[34m4.0f\u001b[39;49;00m * u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + i) + \u001b[34m1\u001b[39;49;00m + j] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m2\u001b[39;49;00m + i) + \u001b[34m1\u001b[39;49;00m + j] + u[(\u001b[34m2\u001b[39;49;00m + n) * i + \u001b[34m1\u001b[39;49;00m + j] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + i) + \u001b[34m2\u001b[39;49;00m + j] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + i) + j];\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "knl = lp.set_options(knl, write_cl=True)\n",
    "\n",
    "evt, (result,) = knl(queue, u=u, n=n)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "That's obviously not very parallel. Introduce parallelism:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ result, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ u)\n",
      "{\n",
      "  result[(\u001b[34m1\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + gid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + gid(\u001b[34m1\u001b[39;49;00m)] = u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + gid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + gid(\u001b[34m1\u001b[39;49;00m)] * u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + gid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + gid(\u001b[34m1\u001b[39;49;00m)] + -\u001b[34m1.0f\u001b[39;49;00m + -\u001b[34m4.0f\u001b[39;49;00m * u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + gid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + gid(\u001b[34m1\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m2\u001b[39;49;00m + gid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + gid(\u001b[34m1\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m1\u001b[39;49;00m + gid(\u001b[34m1\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + gid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m2\u001b[39;49;00m + gid(\u001b[34m1\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + gid(\u001b[34m0\u001b[39;49;00m)) + gid(\u001b[34m1\u001b[39;49;00m)];\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tknl = knl\n",
    "tknl = lp.tag_inames(tknl, {\"i\": \"g.0\", \"j\": \"g.1\"})\n",
    "evt, (result,) = tknl(queue, u=u)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But OpenCL/CUDA require blocking to be efficient!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m16\u001b[39;49;00m, \u001b[34m16\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ result, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ u)\n",
      "{\n",
      "  \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "    result[(\u001b[34m1\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] = u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] * u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] + -\u001b[34m1.0f\u001b[39;49;00m + -\u001b[34m4.0f\u001b[39;49;00m * u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m2\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m2\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] + u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sknl = knl\n",
    "sknl = lp.split_iname(sknl,\n",
    "        \"i\", 16, outer_tag=\"g.1\", inner_tag=\"l.1\")\n",
    "sknl = lp.split_iname(sknl,\n",
    "        \"j\", 16, outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
    "evt, (result,) = sknl(queue, u=u)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How about some data reuse?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-23-c550483828df>:6: DeprecationWarning: Not specifying default_tag is deprecated, and default_tag will become mandatory in 2019.x. Pass 'default_tag=\"l.auto\" to match the current default, or Pass 'default_tag=None to leave the loops untagged, which is the recommended behavior.\n",
      "  sknl = lp.add_prefetch(sknl, \"u\",\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m \u001b[32m__attribute__\u001b[39;49;00m ((reqd_work_group_size(\u001b[34m16\u001b[39;49;00m, \u001b[34m16\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(\u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ result, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ u)\n",
      "{\n",
      "  __local \u001b[36mfloat\u001b[39;49;00m u_fetch[\u001b[34m18\u001b[39;49;00m * \u001b[34m18\u001b[39;49;00m];\n",
      "\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && \u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m u_dim_1_outer = \u001b[34m0\u001b[39;49;00m; u_dim_1_outer <= (-\u001b[34m1\u001b[39;49;00m + n + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m16\u001b[39;49;00m + n + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m ? \u001b[34m1\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + (\u001b[34m1\u001b[39;49;00m + \u001b[34m15\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m)) / \u001b[34m16\u001b[39;49;00m : -\u001b[34m1\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + (\u001b[34m1\u001b[39;49;00m + n + \u001b[34m15\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m)) / \u001b[34m16\u001b[39;49;00m); ++u_dim_1_outer)\n",
      "      \u001b[34mif\u001b[39;49;00m (\u001b[34m17\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * u_dim_1_outer + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "        \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m u_dim_0_outer = \u001b[34m0\u001b[39;49;00m; u_dim_0_outer <= (-\u001b[34m16\u001b[39;49;00m + n + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + n + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m ? \u001b[34m1\u001b[39;49;00m + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + (\u001b[34m1\u001b[39;49;00m + \u001b[34m15\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m)) / \u001b[34m16\u001b[39;49;00m : -\u001b[34m1\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + (\u001b[34m1\u001b[39;49;00m + n + \u001b[34m15\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m)) / \u001b[34m16\u001b[39;49;00m); ++u_dim_0_outer)\n",
      "          \u001b[34mif\u001b[39;49;00m (\u001b[34m17\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * u_dim_0_outer + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) >= \u001b[34m0\u001b[39;49;00m)\n",
      "            u_fetch[\u001b[34m18\u001b[39;49;00m * (\u001b[34m16\u001b[39;49;00m * u_dim_0_outer + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * u_dim_1_outer + lid(\u001b[34m0\u001b[39;49;00m)] = u[(\u001b[34m2\u001b[39;49;00m + n) * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m16\u001b[39;49;00m * u_dim_0_outer + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m16\u001b[39;49;00m * u_dim_1_outer + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "  barrier(CLK_LOCAL_MEM_FENCE) \u001b[37m/* for u_fetch (insn depends on u_fetch_rule) */\u001b[39;49;00m;\n",
      "  \u001b[34mif\u001b[39;49;00m (-\u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && -\u001b[34m1\u001b[39;49;00m + -\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + -\u001b[34m1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "    result[(\u001b[34m1\u001b[39;49;00m + n) * (\u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] = u_fetch[\u001b[34m18\u001b[39;49;00m * (\u001b[34m1\u001b[39;49;00m + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)] * u_fetch[\u001b[34m18\u001b[39;49;00m * (\u001b[34m1\u001b[39;49;00m + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)] + -\u001b[34m1.0f\u001b[39;49;00m + -\u001b[34m4.0f\u001b[39;49;00m * u_fetch[\u001b[34m18\u001b[39;49;00m * (\u001b[34m1\u001b[39;49;00m + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)] + u_fetch[\u001b[34m18\u001b[39;49;00m * (\u001b[34m2\u001b[39;49;00m + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m1\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)] + u_fetch[\u001b[34m18\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m1\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)] + u_fetch[\u001b[34m18\u001b[39;49;00m * (\u001b[34m1\u001b[39;49;00m + lid(\u001b[34m1\u001b[39;49;00m)) + \u001b[34m2\u001b[39;49;00m + lid(\u001b[34m0\u001b[39;49;00m)] + u_fetch[\u001b[34m18\u001b[39;49;00m * (\u001b[34m1\u001b[39;49;00m + lid(\u001b[34m1\u001b[39;49;00m)) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sknl = knl\n",
    "sknl = lp.split_iname(sknl,\n",
    "        \"i\", 16, outer_tag=\"g.1\", inner_tag=\"l.1\")\n",
    "sknl = lp.split_iname(sknl,\n",
    "        \"j\", 16, outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
    "sknl = lp.add_prefetch(sknl, \"u\",\n",
    "    [\"i_inner\", \"j_inner\"],\n",
    "    fetch_bounding_box=True)\n",
    "evt, (result,) = sknl(queue, u=u, n=n)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
